---
title: "Keras tuned Gradient BoostedTrees Model regression model"
subtitle: "with house price data"
execute:
warning: false
error: false
format:
html:
toc: true
toc-location: right
code-fold: show
code-tools: true
number-sections: true
code-block-bg: true
code-block-border-left: "#31BAE9"
---
# package
## install package
```{python}
#| eval: false
import os
os.system('pip install tensorflow_decision_forests')
```
## load package
```{python}
import os
# Keep using Keras 2
os.environ['TF_USE_LEGACY_KERAS'] = '1'
import tensorflow_decision_forests as tfdf
import numpy as np
import pandas as pd
import tensorflow as tf
import tf_keras
import math
```
```{python}
# Check the version of TensorFlow Decision Forests
print("Found TensorFlow Decision Forests v" + tfdf.__version__)
```
# data
data downloaded from [kaggle](https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques/data)
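If the CSV files are not already under `data/`, one way to fetch them is the Kaggle CLI. This is a sketch, not run here; it assumes the `kaggle` package is installed and an API token is configured in `~/.kaggle/kaggle.json`.

```{python}
#| eval: false
# Sketch: download and unzip the competition data with the Kaggle CLI.
# Assumes the `kaggle` package is installed and an API token is configured.
import os
os.system('kaggle competitions download -c house-prices-advanced-regression-techniques -p data')
os.system('unzip -o data/house-prices-advanced-regression-techniques.zip -d data')
```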
## read data
```{python}
train_file_path = "data/train.csv"
dataset_df = pd.read_csv(train_file_path)
print("Full train dataset shape is {}".format(dataset_df.shape))
```
```{python}
dataset_df.head(3)
```
```{python}
dataset_df = dataset_df.drop('Id', axis=1)
dataset_df.head(3)
```
```{python}
#dataset_df.info()
```
## data preprocessing
```{python}
import numpy as np
def split_dataset(dataset, test_ratio=0.30):
test_indices = np.random.rand(len(dataset)) < test_ratio
return dataset[~test_indices], dataset[test_indices]
train_ds_pd, valid_ds_pd = split_dataset(dataset_df)
print("{} examples in training, {} examples in testing.".format(
len(train_ds_pd), len(valid_ds_pd)))
```
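Note that `np.random.rand` is unseeded, so the split (and the counts printed above) change from run to run. A minimal reproducible variant, assuming a fixed seed is acceptable, could look like this:

```{python}
#| eval: false
# Sketch: the same random split, made reproducible with a fixed (assumed) seed.
def split_dataset_seeded(dataset, test_ratio=0.30, seed=1234):
  rng = np.random.default_rng(seed)
  test_indices = rng.random(len(dataset)) < test_ratio
  return dataset[~test_indices], dataset[test_indices]
```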
```{python}
label = 'SalePrice'
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(train_ds_pd, label=label, task = tfdf.keras.Task.REGRESSION)
valid_ds = tfdf.keras.pd_dataframe_to_tf_dataset(valid_ds_pd, label=label, task = tfdf.keras.Task.REGRESSION)
```
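To sanity-check the conversion, one can peek at a single batch of the resulting `tf.data.Dataset`, which yields a dictionary of feature tensors plus the label:

```{python}
#| eval: false
# Sketch: inspect one batch of the converted training dataset.
for features, labels in train_ds.take(1):
  print(list(features.keys())[:5])  # a few feature names
  print(labels[:5])                 # a few SalePrice labels
```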
# model
## define model with a predefined hyper-parameter template
```{python}
# "better_default" is a predefined hyper-parameter template: a configuration
# that is generally better than the default parameters without being more expensive.
model_8 = tfdf.keras.GradientBoostedTreesModel(
    task=tfdf.keras.Task.REGRESSION,
    hyperparameter_template="better_default",
)
```
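Other predefined templates shipped with TF-DF can be listed from the model class; the sketch below assumes the `predefined_hyperparameters()` helper exposed by recent TF-DF versions:

```{python}
#| eval: false
# Sketch: list the hyper-parameter templates available for GradientBoostedTreesModel.
# Assumes predefined_hyperparameters() is available in this TF-DF version.
for template in tfdf.keras.GradientBoostedTreesModel.predefined_hyperparameters():
  print(template)
```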
## compile model
```{python}
model_8.compile(metrics=["mse"])
```
## Train the model
```{python}
model_8.fit(train_ds)
```
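After fitting, `model_8.summary()` prints a description of the trained model (input features, number of trees, and so on):

```{python}
#| eval: false
# Print a description of the trained model.
model_8.summary()
```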
## Evaluate the model
```{python}
evaluation = model_8.evaluate(valid_ds, return_dict=True)
print(evaluation)
```
```{python}
# Pull the mean squared error out of the evaluation dictionary.
mse = evaluation["mse"]
```
RMSE is the square root of the MSE:
```{python}
import math
math.sqrt(mse)
```
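With the error metrics in hand, the same model can produce predictions; for example, on the validation split:

```{python}
#| eval: false
# Sketch: predict SalePrice for the validation examples.
preds = model_8.predict(valid_ds)
print(preds[:5])
```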
```{python}
import matplotlib.pyplot as plt
# Plot the RMSE recorded in the training logs as trees are added to the model.
logs = model_8.make_inspector().training_logs()
plt.plot([log.num_trees for log in logs], [log.evaluation.rmse for log in logs])
plt.xlabel("Number of trees")
plt.ylabel("RMSE (validation)")
plt.show()
```
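The same inspector also exposes variable importances, which help show which features drive the predictions; the exact importance measures reported depend on the TF-DF version:

```{python}
#| eval: false
# Sketch: list the variable importances recorded during training.
inspector = model_8.make_inspector()
for importance_name, values in inspector.variable_importances().items():
  print(importance_name, values[:3])  # top few features per importance measure
```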
# reference:
[TensorFlow Decision Forests beginner colab](https://colab.research.google.com/github/tensorflow/decision-forests/blob/main/documentation/tutorials/beginner_colab.ipynb#scrollTo=xUy4ULEMtDXB)